{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d612c80f-086a-4f36-932c-1d16875c9be7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the shared utilities notebook in this kernel's namespace (-i).\n",
    "# Presumably this is what defines get_emails, used further down - TODO confirm.\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1ea163f1-4ce3-429d-b3a5-b500a1863137",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import Levenshtein"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d6a8d788-63d5-4c37-ae19-483435ec3877",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CSV at this relative path into a DataFrame, decoding it as UTF-8.\n",
    "data_file = \"../data/DataScientist.csv\"\n",
    "df = pd.read_csv(data_file, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "aeca3661-c42a-49c2-9731-d5fcd34d5ae2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_emails is not defined in this notebook; presumably it comes from the\n",
    "# lang_utils notebook run at the top and pulls email addresses out of df - TODO confirm.\n",
    "emails = get_emails(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "95f54479-de45-4560-9f43-03b16c804af7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_levenshtein(input_string, df):\n",
    "    \"\"\"Add a column of Levenshtein distances from `input_string` to each email.\n",
    "\n",
    "    Args:\n",
    "        input_string: Query string compared against every value in `df['emails']`.\n",
    "        df: DataFrame with an 'emails' column; it is modified in place.\n",
    "\n",
    "    Returns:\n",
    "        The same DataFrame, with a 'distance_to_<input_string>' column holding\n",
    "        `Levenshtein.distance(input_string, email)` for each row.\n",
    "    \"\"\"\n",
    "    column_name = 'distance_to_' + input_string\n",
    "\n",
    "    def distance_from_query(candidate):\n",
    "        return Levenshtein.distance(input_string, candidate)\n",
    "\n",
    "    df[column_name] = df['emails'].apply(distance_from_query)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "79ab7372-d7d3-489f-8cff-23ba58d7e96a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_closest_email_lev(df, email):\n",
    "    \"\"\"Return the email in `df` with the smallest Levenshtein distance to `email`.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with an 'emails' column of candidate addresses. It is not\n",
    "            modified; scoring happens on a private copy of that column.\n",
    "        email: Query address to match.\n",
    "\n",
    "    Returns:\n",
    "        The 'emails' value of the row with the minimum distance (on ties, the\n",
    "        first such row, as selected by `Series.idxmin`).\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If `df` has no 'emails' column.\n",
    "        ValueError: If `df` has no rows.\n",
    "    \"\"\"\n",
    "    # Score a copy so the caller's frame does not pick up a leftover\n",
    "    # 'distance_to_<email>' column (find_jaro writes to that same column name).\n",
    "    scored = find_levenshtein(email, df[['emails']].copy())\n",
    "    if scored.empty:\n",
    "        # idxmin would also raise ValueError here, but with an opaque message.\n",
    "        raise ValueError('no emails to compare against: the DataFrame has no rows')\n",
    "    closest_index = scored['distance_to_' + email].idxmin()\n",
    "    # Single .loc lookup instead of the chained df.loc[idx]['emails'].\n",
    "    return scored.loc[closest_index, 'emails']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d6d7cae7-9d9a-4e03-bcac-0b127500b9ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rohit.mcdonald@prolim.com\n"
     ]
    }
   ],
   "source": [
    "# Put the emails returned by get_emails into a single-column frame, then look up\n",
    "# the stored address with the smallest Levenshtein distance to the query string.\n",
    "new_df = pd.DataFrame(emails,columns=['emails'])\n",
    "input_string = \"rohitt.macdonald@prelim.com\"\n",
    "email = get_closest_email_lev(new_df, input_string)\n",
    "print(email)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6acd8a50-3aaf-46b1-bb01-d8dc61bf5545",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_jaro(input_string, df):\n",
    "    \"\"\"Add a column of Jaro scores between `input_string` and each email.\n",
    "\n",
    "    Args:\n",
    "        input_string: Query string compared against every value in `df['emails']`.\n",
    "        df: DataFrame with an 'emails' column; it is modified in place.\n",
    "\n",
    "    Returns:\n",
    "        The same DataFrame, with a 'distance_to_<input_string>' column holding\n",
    "        `Levenshtein.jaro(input_string, email)` for each row. Despite the column\n",
    "        name these are similarity scores (higher means closer), which is why\n",
    "        get_closest_email_jaro picks the maximum rather than the minimum.\n",
    "    \"\"\"\n",
    "    column_name = 'distance_to_' + input_string\n",
    "\n",
    "    def jaro_score_against_query(candidate):\n",
    "        return Levenshtein.jaro(input_string, candidate)\n",
    "\n",
    "    df[column_name] = df['emails'].apply(jaro_score_against_query)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "54de2f41-374e-41cf-807e-f9124dc5b065",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_closest_email_jaro(df, email):\n",
    "    \"\"\"Return the email in `df` with the highest Jaro score against `email`.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with an 'emails' column of candidate addresses. It is not\n",
    "            modified; scoring happens on a private copy of that column.\n",
    "        email: Query address to match.\n",
    "\n",
    "    Returns:\n",
    "        The 'emails' value of the row with the maximum score (on ties, the\n",
    "        first such row, as selected by `Series.idxmax`).\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If `df` has no 'emails' column.\n",
    "        ValueError: If `df` has no rows.\n",
    "    \"\"\"\n",
    "    # Score a copy so the caller's frame does not pick up a leftover\n",
    "    # 'distance_to_<email>' column (find_levenshtein writes to that same column name).\n",
    "    scored = find_jaro(email, df[['emails']].copy())\n",
    "    if scored.empty:\n",
    "        # idxmax would also raise ValueError here, but with an opaque message.\n",
    "        raise ValueError('no emails to compare against: the DataFrame has no rows')\n",
    "    # find_jaro stores similarity scores, so the best match is the maximum.\n",
    "    closest_index = scored['distance_to_' + email].idxmax()\n",
    "    # Single .loc lookup instead of the chained df.loc[idx]['emails'].\n",
    "    return scored.loc[closest_index, 'emails']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "6054cbf9-9d9b-41ad-a813-ae8206f7d952",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rohit.mcdonald@prolim.com\n"
     ]
    }
   ],
   "source": [
    "# Same lookup scored with Levenshtein.jaro; reuses new_df and input_string\n",
    "# defined in the earlier Levenshtein demo cell.\n",
    "email = get_closest_email_jaro(new_df, input_string)\n",
    "print(email)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1360da33-df04-40c1-9813-1946b301c5f2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
